0. Load data

In the initial ‘Data preparation’ step, we’ve added an extra ‘Age’ column. This column will be used to perform mean-difference hypothesis test between two groups. Some example data with the added ‘Age’ column:

# Load data
data <- readRDS("../../output/step_1/finalData.RDS")
dataMatrix <- readRDS("../../output/step_1/dataMatrix.RDS")
keyInfo <- readRDS("../../output/step_1/keyInfo.RDS")
metaInfo <- readRDS("../../output/step_1/metaInfo.RDS")

# Add 'Age' column
matchTable <- data.frame("age"=c("60", "44", "31", "25", "25", "43"), 
                         "SampleID"=c("105", "218", "261", "043", "160", "149"),
                         stringsAsFactors=FALSE)

keyInfo <- merge(keyInfo, matchTable, all.x=TRUE)


# Example data
kable(keyInfo, "html") %>%
      kable_styling(font_size=10) %>%
      kable_styling("striped") %>%
      scroll_box(width="100%")
SampleID Sample_Name Slide Array Basename CellTypeLong CellType Sex age
043 Neu_043 5727920038 R04C01 idat/5727920038_R04C01 Neutrophils Neu M 25
043 CD19+_043 5727920033 R04C01 idat/5727920033_R04C01 CD19+ B-cells Bcell M 25
043 PBMC_043 5727920027 R04C01 idat/5727920027_R04C01 PBMC PBMC M 25
043 CD4+_043 5727920027 R04C02 idat/5727920027_R04C02 CD4+ T-cells CD4T M 25
043 CD14+_043 5727920033 R01C02 idat/5727920033_R01C02 CD14+ Monocytes Mono M 25
043 Eos_043 5727920038 R04C02 idat/5727920038_R04C02 Eosinophils Eos M 25
043 CD56+_043 5727920033 R04C02 idat/5727920033_R04C02 CD56+ NK-cells NK M 25
043 WB_043 5727920027 R01C01 idat/5727920027_R01C01 Whole blood WBC M 25
043 Gran_043 5727920027 R01C02 idat/5727920027_R01C02 Granulocytes Gran M 25
043 CD8+_043 5727920033 R01C01 idat/5727920033_R01C01 CD8+ T-cells CD8T M 25
105 WB_105 5684819001 R01C01 idat/5684819001_R01C01 Whole blood WBC M 60
105 CD4+_105 5684819001 R04C02 idat/5684819001_R04C02 CD4+ T-cells CD4T M 60
105 CD14+_105 5684819004 R01C02 idat/5684819004_R01C02 CD14+ Monocytes Mono M 60
105 PBMC_105 5684819001 R04C01 idat/5684819001_R04C01 PBMC PBMC M 60
105 Neu_105 5727920038 R01C01 idat/5727920038_R01C01 Neutrophils Neu M 60
105 Gran_105 5684819001 R01C02 idat/5684819001_R01C02 Granulocytes Gran M 60
105 Eos_105 5727920038 R01C02 idat/5727920038_R01C02 Eosinophils Eos M 60
149 CD56+_149 5727920033 R06C02 idat/5727920033_R06C02 CD56+ NK-cells NK M 43
149 CD8+_149 5727920033 R03C01 idat/5727920033_R03C01 CD8+ T-cells CD8T M 43
149 CD4+_149 5727920027 R06C02 idat/5727920027_R06C02 CD4+ T-cells CD4T M 43
149 Neu_149 5727920038 R06C01 idat/5727920038_R06C01 Neutrophils Neu M 43
149 CD19+_149 5727920033 R06C01 idat/5727920033_R06C01 CD19+ B-cells Bcell M 43
149 PBMC_149 5727920027 R06C01 idat/5727920027_R06C01 PBMC PBMC M 43
149 WB_149 5727920027 R03C01 idat/5727920027_R03C01 Whole blood WBC M 43
149 CD14+_149 5727920033 R03C02 idat/5727920033_R03C02 CD14+ Monocytes Mono M 43
149 Gran_149 5727920027 R03C02 idat/5727920027_R03C02 Granulocytes Gran M 43
149 Eos_149 5727920038 R06C02 idat/5727920038_R06C02 Eosinophils Eos M 43
160 CD4+_160 5727920027 R05C02 idat/5727920027_R05C02 CD4+ T-cells CD4T M 25
160 CD19+_160 5727920033 R05C01 idat/5727920033_R05C01 CD19+ B-cells Bcell M 25
160 WB_160 5727920027 R02C01 idat/5727920027_R02C01 Whole blood WBC M 25
160 Neu_160 5727920038 R05C01 idat/5727920038_R05C01 Neutrophils Neu M 25
160 Gran_160 5727920027 R02C02 idat/5727920027_R02C02 Granulocytes Gran M 25
160 PBMC_160 5727920027 R05C01 idat/5727920027_R05C01 PBMC PBMC M 25
160 CD56+_160 5727920033 R05C02 idat/5727920033_R05C02 CD56+ NK-cells NK M 25
160 CD14+_160 5727920033 R02C02 idat/5727920033_R02C02 CD14+ Monocytes Mono M 25
160 Eos_160 5727920038 R05C02 idat/5727920038_R05C02 Eosinophils Eos M 25
160 CD8+_160 5727920033 R02C01 idat/5727920033_R02C01 CD8+ T-cells CD8T M 25
218 WB_218 5684819001 R02C01 idat/5684819001_R02C01 Whole blood WBC M 44
218 Gran_218 5684819001 R02C02 idat/5684819001_R02C02 Granulocytes Gran M 44
218 CD8+_218 5684819004 R02C01 idat/5684819004_R02C01 CD8+ T-cells CD8T M 44
218 Neu_218 5727920038 R02C01 idat/5727920038_R02C01 Neutrophils Neu M 44
218 Eos_218 5727920038 R02C02 idat/5727920038_R02C02 Eosinophils Eos M 44
218 CD14+_218 5684819004 R02C02 idat/5684819004_R02C02 CD14+ Monocytes Mono M 44
218 PBMC_218 5684819001 R05C01 idat/5684819001_R05C01 PBMC PBMC M 44
218 CD19+_218 5684819004 R05C01 idat/5684819004_R05C01 CD19+ B-cells Bcell M 44
218 CD4+_218 5684819001 R05C02 idat/5684819001_R05C02 CD4+ T-cells CD4T M 44
261 Gran_261 5684819001 R03C02 idat/5684819001_R03C02 Granulocytes Gran M 31
261 CD8+_261 5684819004 R03C01 idat/5684819004_R03C01 CD8+ T-cells CD8T M 31
261 PBMC_261 5684819001 R06C01 idat/5684819001_R06C01 PBMC PBMC M 31
261 WB_261 5684819001 R03C01 idat/5684819001_R03C01 Whole blood WBC M 31
261 CD4+_261 5684819001 R06C02 idat/5684819001_R06C02 CD4+ T-cells CD4T M 31
261 Eos_261 5727920038 R03C02 idat/5727920038_R03C02 Eosinophils Eos M 31
261 CD14+_261 5684819004 R03C02 idat/5684819004_R03C02 CD14+ Monocytes Mono M 31
261 Neu_261 5727920038 R03C01 idat/5727920038_R03C01 Neutrophils Neu M 31
261 CD56+_261 5684819004 R06C02 idat/5684819004_R06C02 CD56+ NK-cells NK M 31
# Remove redundant data
rm(matchTable)

1. Perform a simple mean-difference hypothesis test between two groups.

In this step we’ve chosen two age groups - people aged below 35, and people older than 35. Student’s t-test is used to determine whether the means of two groups are equal to each other. The dataset contains 3 people that are younger than 35 (25, 25, 31) and 3 people that are older (43, 44, 60).

# Perform t.test on data
ttest <- apply(dataMatrix, 
              1, 
              function(x) t.test(x[keyInfo$age < 35], x[keyInfo$age >= 35]))

# Obtain p-values
pValues <- data.frame(pValue = sapply(ttest, function(x) x$p.value),
                      effectSize = sapply(ttest, function(x) diff(x$estimate)))

2. Visualize top 5 most significant methylation positions (rows)

Since there is a global loss of DNA methylation during aging, we can see in some sites that people over 35 have a smaller methylation level. However, these differences are minimal, and in most plots the methylation level is similar.

# Separate data in two groups by age
firstGroup <- keyInfo[keyInfo$age < 35, ]$Sample_Name
secondGroup <- keyInfo[keyInfo$age >= 35, ]$Sample_Name

# Obtain most significant 5 rows
topRows <- pValues[order(pValues$pValue), ]
topRows <- head(topRows, 5)

# Obtain these rows from dataMatrix
topRowsData <- dataMatrix[match(rownames(topRows), rownames(dataMatrix)), 
                          order(keyInfo$age)]

# Change the column names for factorization later
for(i in 1:55) {
  if(colnames(topRowsData)[i] %in% firstGroup) {
    colnames(topRowsData)[i] <- "first"
  }
  else {
    colnames(topRowsData)[i] <- "second"
  }
}

# Generate plots
for(i in 1:5) {
  plot(
    topRowsData[i, ], 
    main=rownames(topRowsData)[i],
    ylab="Methylation level", 
    col=as.factor(colnames(topRowsData))
  )
  legend("topright", 
         inset=.02, 
         title="Age", 
         c("< 35", ">= 35"), 
         fill=c("black", "red"), 
         horiz=TRUE, 
         cex=0.8)
}

# Remove redundant data
rm(firstGroup)
rm(secondGroup)
rm(topRows)
rm(i)

3. Provide a table with number of significant rows at the following levels

# Produce a table from dataframe
valuesTable <- data.frame(
  "levels"          = c("alpha = 0.1", "alpha = 0.05",
                        "alpha = 0.01", "FDR correction = 0.05", 
                        "Bonferroni correction = 0.05"),
  "significantRows" = c(sum(pValues$pValue<0.1),
                        sum(pValues$pValue<0.05),
                        sum(pValues$pValue<0.01),
                        sum(p.adjust(pValues$pValue, 
                                    method="fdr")<0.05),
                        sum(p.adjust(pValues$pValue,
                                    method="bonferroni")<0.05)
                      )
)

# Generate the table
kable(valuesTable, "html") %>%
      kable_styling(font_size=10) %>%
      kable_styling("striped") %>%
      scroll_box(width="100%")
levels significantRows
alpha = 0.1 53307
alpha = 0.05 29068
alpha = 0.01 6159
FDR correction = 0.05 0
Bonferroni correction = 0.05 0

4. Visualize the histogram

Our p-values appear to be uniformly distributed.

hist(
  pValues$pValue,
  main="p-values",
  xlab="p-value",
  col=c("gray", "lightpink")
)

5. Visualize the volcano-plot

A volcano plot combines a measure of statistical significance from p-values with the magnitude of the change. There aren’t any points that are found toward the top of the plot that are far to either the left or right-hand sides. This indicates that there isn’t a value that displays large magnitude fold changes as well as high statistical significance.

plot(
  pValues$effectSize,
  -log(pValues$pValue),
  main="Volcano plot",
    xlab="Effect size",
  ylab="p-value log",
  col=c("gray", "lightpink")
)

6. Visualize a manhattan plot

For this step we chose chrX. The strongest associations have the smallest p-values, hence their negative logarithms are the greatest.

plot(
  x=metaInfo$pos[metaInfo$chr=="chrX"],
  y=-log10(pValues$pValue[metaInfo$chr=="chrX"]),
  main="Manhattan plot",
  xlab="Chromosome X positions",
  ylab="-log10",
  col=c("gray", "lightpink")
)

7. Perform the first step using a linear model ???